Load libraries
library(tidyverse)
library(ggmap)
library(leaflet)
library(lubridate)
library(ggplot2)
library(leaflet)
Load MTA data & Google Transit static files
stations <- read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')
stops <- read_csv('../../data/google_transit_subway_static/stops.txt')
routes <- read_csv('../../data/google_transit_subway_static/routes.txt')
trips <- read_csv('../../data/google_transit_subway_static/trips.txt')
stop_times <- read_csv('../../data/google_transit_subway_static/stop_times.txt')
Make things easier later on
# for compatibility when joining
trips$route_id <- as.character(trips$route_id)
# take care of NA color values
routes$route_color <- replace_na(routes$route_color, "000000")
Maybe there’s some useful information in these dfs?
connections <- stop_times %>%
filter(!is.na(arrival_time)) %>%
left_join(stops) %>%
extract(trip_id, c("route_id"), regex=".*_.*_(.*)\\.\\..*", remove=FALSE) %>%
extract(trip_id, c("day_of_week"), regex=".*-.*-(.*)-.*", remove=FALSE) %>%
extract(trip_id, c("time"), regex=".*-.*-.*-.*_(.*)_.*\\.\\..*", remove=FALSE) %>%
mutate(stop_id = substr(stop_id, 1, 3),
prev_stop_id = ifelse(trip_id == lag(trip_id), lag(stop_id), NA),
prev_stop_lat = ifelse(trip_id == lag(trip_id), lag(stop_lat), NA),
prev_stop_lon = ifelse(trip_id == lag(trip_id), lag(stop_lon), NA),
prev_stop_name = ifelse(trip_id == lag(trip_id), lag(stop_name), NA))
## Joining, by = "stop_id"
### if we need don't care about previous stop info
# connections <- stop_times %>%
# filter(!is.na(arrival_time)) %>%
# left_join(stops) %>%
# extract(trip_id, c("route_id"), regex=".*_.*_(.*)\\.\\..*", remove=FALSE) %>%
# extract(trip_id, c("day_of_week"), regex=".*-.*-(.*)-.*", remove=FALSE) %>%
# extract(trip_id, c("time"), regex=".*-.*-.*-.*_(.*)_.*\\.\\..*", remove=FALSE) %>%
# mutate(stop_id = substr(stop_id, 1, 3)) %>%
# select(trip_id, time, day_of_week, route_id, arrival_time, departure_time, stop_id, stop_sequence, stop_name)
# add in the converted trip start time and route colors
with_times <- connections %>%
mutate(trip_start_time = seconds_to_period(as.numeric(time)*.6),
trip_start_time = as.POSIXct(sprintf("%s:%s:%s",
hour(trip_start_time), minute(trip_start_time), second(trip_start_time)),
"%H:%M:%S", tz="America/New_York")) %>%
left_join(routes %>% select(route_id, route_color)) %>%
mutate(route_color = sprintf("#%s", route_color))
## Joining, by = "route_id"
# late night service is generally midnight-6am, but let's widen the window to 10pm-6am
# indicate whether or not the trip takes place on a weekend or weekday
# get direction: 1 = S, 0 = N
sequences <- with_times %>%
mutate(late_night = ifelse(hour(trip_start_time) >= 22 | hour(trip_start_time) <= 6, 1, 0),
is_weekend = ifelse(day_of_week == "Saturday" | day_of_week == "Sunday", 1, 0)) %>%
left_join(trips) %>%
select(route_id, trip_id, direction_id, day_of_week, trip_start_time, arrival_time, departure_time,
stop_id, stop_name, late_night, is_weekend, stop_lat, stop_lon, prev_stop_id, prev_stop_name,
prev_stop_lat, prev_stop_lon, route_color)
## Joining, by = c("trip_id", "route_id")
### if we don't care about previous stop info
# sequences <- with_times %>%
# mutate(late_night = ifelse(hour(trip_start_time) >= 22 | hour(trip_start_time) <= 6, 1, 0),
# is_weekend = ifelse(day_of_week == "Saturday" | day_of_week == "Sunday", 1, 0)) %>%
# left_join(trips) %>%
# select(route_id, trip_id, direction_id, day_of_week, trip_start_time, arrival_time, departure_time,
# stop_id, stop_name, late_night, is_weekend, route_color)
# just the distinct trip ids and a list of the corresponding stops
paths <- sequences %>% select(trip_id, stop_id) %>%
group_by(trip_id) %>%
mutate(path = paste0(stop_id, sep= ",", collapse = " ")) %>%
select(trip_id, path) %>%
distinct
# for distinct each trip, the route, id, day, time
trips_times <- sequences %>%
select(route_id, trip_id, direction_id, day_of_week, trip_start_time, late_night, is_weekend) %>%
distinct
# number of stops visited for each distinct trip id
counts <- sequences %>%
group_by(route_id, trip_id, trip_start_time) %>%
summarize(num_stops = n())
# number of stops visited, in descending order, by trip id, and hour of trip start time
ordered_counts <- counts[order(counts$route_id, counts$num_stops),] %>%
mutate(hour = as.integer(substr(trip_start_time, 1, 2)))
# put it all together
paths_info <- paths %>%
left_join(trips_times) %>%
left_join(counts)
## Joining, by = "trip_id"
## Joining, by = c("trip_id", "route_id", "trip_start_time")
# only the unique routes
distinct_routes <- paths_info %>%
ungroup() %>%
group_by(route_id, path, direction_id, day_of_week, is_weekend, late_night, num_stops) %>%
summarize(count=n())
Visualize different trips
# a more concise version of the with_times df
map_data <- with_times %>% select(route_id, trip_id, day_of_week, trip_start_time, arrival_time, departure_time,
stop_id, stop_name, stop_lat, stop_lon, prev_stop_id, prev_stop_name, prev_stop_lat, prev_stop_lon, route_color)
# change the route, day = {Saturday, Sunday, Weekday}, and a specific time
# hour and minute must directly coincide with a scheduled trip start time
get_scheduled_map <- function(df, route, day, time){
time <- as.POSIXct(time, format="%H:%M:%S")
new_interval <- (time - minutes(30)) %--% (time + minutes(30))
selected_route <- df %>%
filter(route_id == route, day_of_week == day,
trip_start_time %within% new_interval)
# View(selected_route)
nyc_map <- get_map(location = c(lon = -73.9568247, lat = 40.7202688), maptype = "terrain", zoom = 11)
ggmap(nyc_map) +
geom_point(data = selected_route, aes(x = stop_lon, y = stop_lat), color = selected_route$route_color) +
geom_segment(data = selected_route, aes(x=prev_stop_lon, y=prev_stop_lat, xend=stop_lon, yend=stop_lat), color = selected_route$route_color)
}
The 2 train around 9am on a weekday (express)
get_scheduled_map(map_data, "B", "Weekday", "12:10:00")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.720269,-73.956825&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Warning: Removed 12 rows containing missing values (geom_segment).

The 2 train around 4am on a weekday (local)
get_scheduled_map(map_data, "2", "Weekday", "04:24:00")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.720269,-73.956825&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Warning: Removed 32 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_segment).

Map all routes
# show all routes at a given day/time
get_scheduled_map_all <- function(df, day, time){
time <- as.POSIXct(time, format="%H:%M:%S")
new_interval <- (time - minutes(30)) %--% (time + minutes(30))
selected_route <- df %>%
filter(day_of_week == day, trip_start_time %within% new_interval)
# View(selected_route)
nyc_map <- get_map(location = c(lon = -73.9568247, lat = 40.7202688), maptype = "terrain", zoom = 11)
ggmap(nyc_map) +
geom_point(data = selected_route, aes(x = stop_lon, y = stop_lat), color = selected_route$route_color) +
geom_segment(data = selected_route, aes(x=prev_stop_lon, y=prev_stop_lat, xend=stop_lon, yend=stop_lat), color = selected_route$route_color)
}
All trains from 12am to 1am on a Weekday
get_scheduled_map_all(map_data, "Weekday", "12:30:00")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.720269,-73.956825&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Warning: Removed 132 rows containing missing values (geom_point).
## Warning: Removed 472 rows containing missing values (geom_segment).

Do it in leaflet
get_scheduled_map_leaflet <- function(df, day, time){
time <- as.POSIXct(time, format="%H:%M:%S")
new_interval <- (time - minutes(30)) %--% (time + minutes(30))
selected_route <- df %>%
filter(day_of_week == day, trip_start_time %within% new_interval)
# View(selected_route)
selected_route$stop_lat <- jitter(selected_route$stop_lat, factor = 3)
selected_route$stop_lon <- jitter(selected_route$stop_lon , factor= 3)
map <- leaflet() %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(selected_route$stop_lon, selected_route$stop_lat, color = selected_route$route_color,
popup = paste(selected_route$stop_name, "<br/>", selected_route$route_id),
radius = 4)
for (i in 1:nrow(selected_route)) {
map <- map %>%
addPolylines(lat = c(selected_route[i,]$stop_lat, selected_route[i,]$prev_stop_lat),
lng = c(selected_route[i,]$stop_lon, selected_route[i,]$prev_stop_lon),
color = selected_route[i,]$route_color,
weight = 1)
}
map
}
All trains 12am-1am on a Weekday
get_scheduled_map_leaflet(map_data, "Weekday", "12:30:00")